package third.spider.parser;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import third.JsoupTools;

import java.util.*;

/**
 * Helpers for extracting link targets from the {@code <a>} tags of an HTML document.
 *
 * @author zengjian
 * @create 2018-07-10 16:25
 * @since 1.0.0
 */
public class ATagParser {

    /** Scheme prefix of script pseudo-links, which are never crawlable. */
    private static final String JAVASCRIPT_SCHEME = "javascript:";

    /** Scheme prefix of e-mail links, which are never crawlable. */
    private static final String MAILTO_SCHEME = "mailto:";

    /**
     * Collects the {@code href} attribute of every {@code a[href]} element by delegating to
     * {@code JsoupTools.parse(html, "a[href]", "href")}.
     *
     * @param html the HTML text to scan
     * @return the map produced by {@code JsoupTools.parse}; its key/value layout is defined
     *         by that helper (not visible here)
     */
    public static Map<String, String> parse(String html) {
        return JsoupTools.parse(html, "a[href]", "href");
    }

    /**
     * Returns the values of {@link #parse(String)}.
     *
     * @param html the HTML text to scan
     * @return the value collection of the map returned by {@link #parse(String)}
     */
    public static Collection<String> parseUrl(String html) {
        return parse(html).values();
    }

    /**
     * Extracts every usable link of the document as a {@code url -> link text} map.
     *
     * <p>Empty hrefs, fragment-only hrefs ({@code #...}), {@code javascript:} and
     * {@code mailto:} links are skipped. Hrefs starting with {@code /} are combined with
     * {@code baseUrl}; all other hrefs are kept as written (trimmed). When the same URL occurs
     * more than once, it keeps the position of its first occurrence and the text of its last.
     *
     * @param html    the HTML text to scan; may be {@code null} or blank
     * @param baseUrl the URL that root-relative hrefs are merged with; may be {@code null},
     *                in which case such hrefs are returned unchanged
     * @return an insertion-ordered, mutable map keyed by URL; empty (never {@code null}) when
     *         {@code html} is {@code null} or blank
     */
    public static Map<String, String> parseMap(String html, String baseUrl) {
        Map<String, String> links = new LinkedHashMap<>();
        if (html == null || html.trim().isEmpty()) {
            return links;
        }
        Document document = Jsoup.parse(html);
        Elements anchors = document.select("a[href]");
        for (Element anchor : anchors) {
            String url = normalizeHref(anchor.attr("href"), baseUrl);
            if (url == null) {
                continue;
            }
            // The URL is the key; the anchor text is the value.
            links.put(url, anchor.text());
        }
        return links;
    }

    /**
     * Returns the distinct link URLs found in the document's {@code <a>} tags, in order of
     * first appearance. Filtering and merging rules are those of
     * {@link #parseMap(String, String)}.
     *
     * @param html    the HTML text to scan; may be {@code null} or blank
     * @param baseUrl the URL that root-relative hrefs are merged with; may be {@code null}
     * @return a new mutable list of unique URLs; empty when {@code html} is {@code null} or blank
     */
    public static List<String> parseUrl(String html, String baseUrl) {
        // The map is insertion-ordered and keyed by the merged URL, so its key set is exactly
        // the de-duplicated URL sequence (de-duplication happens after relative paths are merged).
        return new ArrayList<>(parseMap(html, baseUrl).keySet());
    }

    /**
     * Merges a relative path with a base URL by removing the part where the end of
     * {@code baseUrl} overlaps the start of {@code url}, then concatenating. Examples:
     *
     * <ol>
     *   <li>{@code /cn/reginit.doAction} + {@code http://www.infoq.com/cn/}
     *       gives {@code http://www.infoq.com/cn/reginit.doAction}</li>
     *   <li>{@code /privacypolicy} + {@code http://www.infoq.com/cn/}
     *       gives {@code http://www.infoq.com/cn/privacypolicy}</li>
     *   <li>{@code www.infoq.com/cn/reginit.doAction} + {@code http://www.infoq.com/cn}
     *       gives {@code http://www.infoq.com/cn/reginit.doAction}</li>
     * </ol>
     *
     * <p>This is a plain string-overlap heuristic, not RFC 3986 reference resolution: the base
     * URL's path is never truncated and no separator is inserted when there is no overlap.
     *
     * @param url     the relative path; when {@code null} or blank, {@code baseUrl} is returned
     * @param baseUrl the base URL; when {@code null} or blank, {@code url} is returned unchanged
     *                so the link is not lost
     * @return the merged URL
     */
    public static String mergeUrl(String url, String baseUrl) {
        if (url == null || url.trim().isEmpty()) {
            return baseUrl;
        }
        if (baseUrl == null || baseUrl.trim().isEmpty()) {
            return url;
        }
        int baseLength = baseUrl.length();
        // Try the longest possible overlap first so the largest shared segment is removed.
        for (int overlap = Math.min(baseLength, url.length()); overlap > 0; overlap--) {
            if (baseUrl.regionMatches(baseLength - overlap, url, 0, overlap)) {
                return baseUrl + url.substring(overlap);
            }
        }
        return baseUrl + url;
    }

    /**
     * Delegates to {@code JsoupTools.parse(html, cssQuery, attr)}.
     *
     * @param html     the HTML text to scan
     * @param cssQuery the CSS selector passed to {@code JsoupTools.parse}
     * @param attr     the attribute name passed to {@code JsoupTools.parse}
     * @return the map produced by {@code JsoupTools.parse}
     */
    public static Map<String, String> parse(String html, String cssQuery, String attr) {
        return JsoupTools.parse(html, cssQuery, attr);
    }

    /**
     * Turns a raw {@code href} value into the URL to report, or {@code null} when the link
     * must be skipped (empty, fragment-only, {@code javascript:} or {@code mailto:}).
     */
    private static String normalizeHref(String href, String baseUrl) {
        if (href == null) {
            return null;
        }
        String url = href.trim();
        if (url.isEmpty()
                || url.startsWith("#")
                || hasScheme(url, JAVASCRIPT_SCHEME)
                || hasScheme(url, MAILTO_SCHEME)) {
            return null;
        }
        if (url.startsWith("//")) {
            // Protocol-relative reference: it names its own host, so only the scheme of the
            // base URL applies. Without a usable base it is left as written.
            int schemeEnd = baseUrl == null ? -1 : baseUrl.indexOf("://");
            return schemeEnd > 0 ? baseUrl.substring(0, schemeEnd + 1) + url : url;
        }
        if (url.startsWith("/")) {
            return mergeUrl(url, baseUrl);
        }
        return url;
    }

    /** Case-insensitive test for a URI scheme prefix such as {@code "mailto:"}. */
    private static boolean hasScheme(String url, String scheme) {
        return url.regionMatches(true, 0, scheme, 0, scheme.length());
    }
}
